library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.1.8
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(dplyr)
library(stringr)
library(knitr)
books <- read_csv("data/books.csv")
## Rows: 11123 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): title, authors, isbn, isbn13, language_code, publication_date, publ...
## dbl (6): rowid, bookID, average_rating, num_pages, ratings_count, text_revie...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(books)
## Rows: 11,123
## Columns: 13
## $ rowid <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, …
## $ bookID <dbl> 1, 2, 4, 5, 8, 9, 10, 12, 13, 14, 16, 18, 21, 22, 2…
## $ title <chr> "Harry Potter and the Half-Blood Prince (Harry Pott…
## $ authors <chr> "J.K. Rowling/Mary GrandPré", "J.K. Rowling/Mary Gr…
## $ average_rating <dbl> 4.57, 4.49, 4.42, 4.56, 4.78, 3.74, 4.73, 4.38, 4.3…
## $ isbn <chr> "0439785960", "0439358078", "0439554896", "04396554…
## $ isbn13 <chr> "9780439785969", "9780439358071", "9780439554893", …
## $ language_code <chr> "eng", "eng", "eng", "eng", "eng", "en-US", "eng", …
## $ num_pages <dbl> 652, 870, 352, 435, 2690, 152, 3342, 815, 815, 215,…
## $ ratings_count <dbl> 2095690, 2153167, 6333, 2339585, 41428, 19, 28242, …
## $ text_reviews_count <dbl> 27591, 29221, 244, 36325, 164, 1, 808, 254, 4080, 4…
## $ publication_date <chr> "9/16/2006", "9/1/2004", "11/1/2003", "5/1/2004", "…
## $ publisher <chr> "Scholastic Inc.", "Scholastic Inc.", "Scholastic",…
The variable names are:
names(books)
## [1] "rowid" "bookID" "title"
## [4] "authors" "average_rating" "isbn"
## [7] "isbn13" "language_code" "num_pages"
## [10] "ratings_count" "text_reviews_count" "publication_date"
## [13] "publisher"
janitor, and save data under a new
namebooks_cleaned <- janitor::clean_names(books)
#glimpse(books)
NA values:books_cleaned %>%
summarise(across(rowid:publisher, ~sum(is.na(.x))))
There are no NA values!
most_titled_pub <- books_cleaned %>%
group_by(publisher) %>%
summarise(title_count = (n_distinct(title))) %>%
arrange(desc(title_count)) %>%
head(1)
most_titled_pub
avg_overall_rating <- books_cleaned %>%
group_by(publisher) %>%
mutate(avg_overall_rating = (mean(average_rating))) %>%
arrange(desc(avg_overall_rating)) %>%
ungroup() %>%
select(publisher, avg_overall_rating)
avg_overall_rating
books_cleaned %>%
group_by(publisher) %>%
summarise(language_count = n_distinct(language_code)) %>%
arrange(desc(language_count)) %>%
filter(language_count >1)
books_cleaned %>%
select(publisher, language_code) %>%
filter(publisher == "Oxford University Press USA") %>%
distinct(language_code, publisher) #to remove duplicate entries
str_sub() function:books_years <- books_cleaned %>%
mutate(publication_year = str_sub(publication_date, -4),
.after = publication_date) %>%
arrange(publication_year)
books_years
books_years %>%
mutate(publication_year = as.integer(publication_year)) %>%
select("publication_year", "title", "average_rating") %>%
arrange(publication_year, desc(average_rating)) %>%
group_by(publication_year) %>%
slice_max(average_rating, n=1)
books_years %>%
group_by(publication_year) %>%
summarise(books_per_year = n_distinct(title)) %>%
arrange(publication_year) #%>%
#arrange(desc(books_per_year)) #to check year with most ratings
url_ISBN13 <- "https://blog-cdn.reedsy.com/directories/admin/attachments/large_How-to-read-an-ISBN-2d4aca.jpg"
include_graphics(url_ISBN13)
Since the 5-7th characters denote the publisher, we could use these to identify the publisher for each book. This would negate the issue of having duplicated publisher entries with slightly different names.
First, confirm that all ISBN-13 codes are actually 13 characters long, so therefore we can assume the 5-7th characters represent the publisher:
books_cleaned %>%
mutate(isbn13_count = nchar(isbn13)) %>%
filter(isbn13_count != 13)
Therefore all ISBN13 codes are 13 characters long, so assume they follow the described format.
Next, split ISBN13 codes up, to get 5-7th character, and save
this as a new variable called isbn13_publisher
books_isbn_pub <- books_cleaned %>%
mutate(isbn13_publisher = substr(isbn13, 5, 7),
.after = publisher) %>%
arrange(publisher)
books_isbn_pub
books_isbn13_pub_grouped <- books_isbn_pub %>%
group_by(publisher) %>%
summarise(count_isbn_pub = n_distinct(isbn13_publisher)) %>%
arrange(desc(count_isbn_pub)) %>%
filter(count_isbn_pub >1)
books_isbn13_pub_grouped
There are 444 publishers that have more than one identifier in the ISBN13 codes, so unfortunately using the ISBN13 code wouldn’t clear up the duplicate publisher issue, and might make things more complicated.
A quick dive to try to find a reason - could it be language, authors, date related? Look at a publisher with >1 codes called “HarperCollins”
books_isbn_pub %>%
filter(publisher == "HarperCollins") %>%
arrange(isbn13_publisher)